Merge pull request #264 from knu/website_agent-faraday

 Add support for using alternative HTTP backends via Faraday

Andrew Cantino 11 years ago
parent
commit
8767e81985
5 changed files with 117 additions and 40 deletions
  1. 7 0
      .env.example
  2. 2 0
      Gemfile
  3. 4 0
      Gemfile.lock
  4. 69 38
      app/models/agents/website_agent.rb
  5. 35 2
      spec/models/agents/website_agent_spec.rb

+ 7 - 0
.env.example

@@ -82,6 +82,13 @@ AWS_SANDBOX=false
82 82
 #   Various Settings   #
83 83
 ########################
84 84
 
85
+# Specify the HTTP backend library for Faraday, used in WebsiteAgent.
86
+# You can change this depending on the performance and stability you
87
+# need for your service.  Any choice other than "typhoeus",
88
+# "net_http", or "em_http" would require you to bundle a corresponding
89
+# gem via Gemfile.
90
+FARADAY_HTTP_BACKEND=typhoeus
91
+
85 92
 # Allow JSONPath eval expressions. e.g., $..price[?(@ < 20)]
86 93
 # You should not allow this on a shared Huginn box because it is not secure.
87 94
 ALLOW_JSONPATH_EVAL=false

+ 2 - 0
Gemfile

@@ -33,6 +33,8 @@ gem 'geokit', '~> 1.6.7'
33 33
 gem 'geokit-rails3', '~> 0.1.5'
34 34
 
35 35
 gem 'kramdown', '~> 1.1.0'
36
+gem 'faraday', '~> 0.9.0'
37
+gem 'faraday_middleware'
36 38
 gem 'typhoeus', '~> 0.6.3'
37 39
 gem 'nokogiri', '~> 1.6.0'
38 40
 

+ 4 - 0
Gemfile.lock

@@ -106,6 +106,8 @@ GEM
106 106
     execjs (2.0.2)
107 107
     faraday (0.9.0)
108 108
       multipart-post (>= 1.2, < 3)
109
+    faraday_middleware (0.9.1)
110
+      faraday (>= 0.7.4, < 0.10)
109 111
     ffi (1.9.3)
110 112
     forecast_io (2.0.0)
111 113
       faraday
@@ -316,6 +318,8 @@ DEPENDENCIES
316 318
   devise (~> 3.0.0)
317 319
   dotenv-rails
318 320
   em-http-request (~> 1.1.2)
321
+  faraday (~> 0.9.0)
322
+  faraday_middleware
319 323
   forecast_io (~> 2.0.0)
320 324
   foreman (~> 0.63.0)
321 325
   geokit (~> 1.6.7)

+ 69 - 38
app/models/agents/website_agent.rb

@@ -1,5 +1,6 @@
1 1
 require 'nokogiri'
2
-require 'typhoeus'
2
+require 'faraday'
3
+require 'faraday_middleware'
3 4
 require 'date'
4 5
 
5 6
 module Agents
@@ -21,24 +22,24 @@ module Agents
21 22
 
22 23
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
23 24
 
24
-      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `'text': true` or `attr` pointing to an attribute name to grab.  An example:
25
+      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab.  An example:
25 26
 
26
-          'extract': {
27
-            'url': { 'css': "#comic img", 'attr': "src" },
28
-            'title': { 'css': "#comic img", 'attr': "title" },
29
-            'body_text': { 'css': "div.main", 'text': true }
27
+          "extract": {
28
+            "url": { "css": "#comic img", "attr": "src" },
29
+            "title": { "css": "#comic img", "attr": "title" },
30
+            "body_text": { "css": "div.main", "text": true }
30 31
           }
31 32
 
32 33
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
33 34
 
34
-          'extract': {
35
-            'title': { 'path': "results.data[*].title" },
36
-            'description': { 'path': "results.data[*].description" }
35
+          "extract": {
36
+            "title": { "path": "results.data[*].title" },
37
+            "description": { "path": "results.data[*].description" }
37 38
           }
38 39
 
39 40
       Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.  For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
40 41
 
41
-      Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
42
+      Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `"username:password"`, or `["username", "password"]`.
42 43
 
43 44
       Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.  This is only used to set the "working" status.
44 45
 
@@ -46,6 +47,8 @@ module Agents
46 47
 
47 48
       Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.
48 49
 
50
+      Set `user_agent` to a custom User-Agent name if the website does not like the default value ("Faraday v#{Faraday::VERSION}").
51
+
49 52
       The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload.
50 53
     MD
51 54
 
@@ -103,34 +106,29 @@ module Agents
103 106
           errors.add(:base, "force_encoding must be a string")
104 107
         end
105 108
       end
109
+
110
+      if options['user_agent'].present?
111
+        errors.add(:base, "user_agent must be a string") unless options['user_agent'].is_a?(String)
112
+      end
113
+
114
+      begin
115
+        basic_auth_credentials()
116
+      rescue => e
117
+        errors.add(:base, e.message)
118
+      end
106 119
     end
107 120
 
108 121
     def check
109
-      log "Fetching #{options['url']}"
110 122
       check_url options['url']
111 123
     end
112 124
 
113 125
     def check_url(in_url)
114
-      hydra = Typhoeus::Hydra.new
115
-      request_opts = { :followlocation => true }
116
-      request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
117
-
118
-      requests = []
119
-
120
-      if in_url.kind_of?(Array)
121
-        in_url.each do |url|
122
-           requests.push(Typhoeus::Request.new(url, request_opts))
123
-        end
124
-      else
125
-        requests.push(Typhoeus::Request.new(in_url, request_opts))
126
-      end
127
-
128
-      requests.each do |request|
129
-        request.on_failure do |response|
130
-          error "Failed: #{response.inspect}"
131
-        end
126
+      return unless in_url.present?
132 127
 
133
-        request.on_success do |response|
128
+      Array(in_url).each do |url|
129
+        log "Fetching #{url}"
130
+        response = faraday.get(url)
131
+        if response.success?
134 132
           body = response.body
135 133
           if (encoding = options['force_encoding']).present?
136 134
             body = body.encode(Encoding::UTF_8, encoding)
@@ -155,7 +153,7 @@ module Agents
155 153
                 when xpath = extraction_details['xpath']
156 154
                   nodes = doc.xpath(xpath)
157 155
                 else
158
-                  error "'css' or 'xpath' is required for HTML or XML extraction"
156
+                  error '"css" or "xpath" is required for HTML or XML extraction'
159 157
                   return
160 158
                 end
161 159
                 unless Nokogiri::XML::NodeSet === nodes
@@ -168,7 +166,7 @@ module Agents
168 166
                   elsif extraction_details['text']
169 167
                     node.text()
170 168
                   else
171
-                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
169
+                    error '"attr" or "text" is required on HTML or XML extraction patterns'
172 170
                     return
173 171
                   end
174 172
                 }
@@ -183,14 +181,14 @@ module Agents
183 181
               error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
184 182
               return
185 183
             end
186
-        
184
+
187 185
             old_events = previous_payloads num_unique_lengths.first
188 186
             num_unique_lengths.first.times do |index|
189 187
               result = {}
190 188
               options['extract'].keys.each do |name|
191 189
                 result[name] = output[name][index]
192 190
                 if name.to_s == 'url'
193
-                  result[name] = URI.join(request.base_url, result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
191
+                  result[name] = (response.env[:url] + result[name]).to_s
194 192
                 end
195 193
               end
196 194
 
@@ -200,10 +198,9 @@ module Agents
200 198
               end
201 199
             end
202 200
           end
201
+        else
202
+          error "Failed: #{response.inspect}"
203 203
         end
204
-
205
-        hydra.queue request
206
-        hydra.run
207 204
       end
208 205
     end
209 206
 
@@ -288,6 +285,40 @@ module Agents
288 285
       end
289 286
     end
290 287
 
291
-  end
288
+    def faraday
289
+      @faraday ||= Faraday.new { |builder|
290
+        if (user_agent = options['user_agent']).present?
291
+          builder.headers[:user_agent] = user_agent
292
+        end
292 293
 
294
+        builder.use FaradayMiddleware::FollowRedirects
295
+        builder.request :url_encoded
296
+        if userinfo = basic_auth_credentials()
297
+          builder.request :basic_auth, *userinfo
298
+        end
299
+
300
+        case backend = faraday_backend
301
+        when :typhoeus
302
+          require 'typhoeus/adapters/faraday'
303
+        end
304
+        builder.adapter backend
305
+      }
306
+    end
307
+
308
+    def faraday_backend
309
+      ENV.fetch('FARADAY_HTTP_BACKEND', 'typhoeus').to_sym
310
+    end
311
+
312
+    def basic_auth_credentials
313
+      case value = options['basic_auth']
314
+      when nil, ''
315
+        return nil
316
+      when Array
317
+        return value if value.size == 2
318
+      when /:/
319
+        return value.split(/:/, 2)
320
+      end
321
+      raise "bad value for basic_auth: #{value.inspect}"
322
+    end
323
+  end
293 324
 end

+ 35 - 2
spec/models/agents/website_agent_spec.rb

@@ -348,7 +348,9 @@ describe Agents::WebsiteAgent do
348 348
 
349 349
   describe "checking with http basic auth" do
350 350
     before do
351
-      stub_request(:any, /user:pass/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), :status => 200)
351
+      stub_request(:any, /example/).
352
+        with(headers: { 'Authorization' => "Basic #{['user:pass'].pack('m').chomp}" }).
353
+        to_return(:body => File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), :status => 200)
352 354
       @site = {
353 355
         'name' => "XKCD",
354 356
         'expected_update_period_in_days' => 2,
@@ -374,4 +376,35 @@ describe Agents::WebsiteAgent do
374 376
       end
375 377
     end
376 378
   end
377
-end
379
+
380
+  describe "checking with User-Agent" do
381
+    before do
382
+      stub_request(:any, /example/).
383
+        with(headers: { 'User-Agent' => 'Sushi' }).
384
+        to_return(:body => File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), :status => 200)
385
+      @site = {
386
+        'name' => "XKCD",
387
+        'expected_update_period_in_days' => 2,
388
+        'type' => "html",
389
+        'url' => "http://www.example.com",
390
+        'mode' => 'on_change',
391
+        'extract' => {
392
+          'url' => { 'css' => "#comic img", 'attr' => "src" },
393
+          'title' => { 'css' => "#comic img", 'attr' => "alt" },
394
+          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
395
+        },
396
+        'user_agent' => "Sushi"
397
+      }
398
+      @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @site)
399
+      @checker.user = users(:bob)
400
+      @checker.save!
401
+    end
402
+
403
+    describe "#check" do
404
+      it "should check for changes" do
405
+        lambda { @checker.check }.should change { Event.count }.by(1)
406
+        lambda { @checker.check }.should_not change { Event.count }
407
+      end
408
+    end
409
+  end
410
+end